Loading libraries

library(jpeg)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.0.0           ✔ purrr   0.2.5      
## ✔ tibble  1.4.2           ✔ dplyr   0.7.99.9000
## ✔ tidyr   0.8.1           ✔ stringr 1.3.1      
## ✔ readr   1.1.1           ✔ forcats 0.3.0
## ── Conflicts ────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(knitr)
library(janitor)
library(corrgram)
library(rpart)

Downloading images

# Image URLs and the local file names they will be saved under
img_url <- c("http://curso-r.com/img/blog/desafio-recuperacao-img/purple_wave.jpg",
             "http://curso-r.com/img/blog/desafio-recuperacao-img/xadrez_colorido.jpg")
img_name <- basename(img_url)

# Download each image. mode = "wb" is required so the JPEGs are written as
# binary files -- the platform-dependent default (text mode on Windows)
# silently corrupts binary downloads.
walk2(img_url, img_name, function(url, dest) {
  download.file(url = url,
                destfile = dest,
                mode = "wb")
})

Reading images using JPEG package into a list

# Read every downloaded JPEG into a named list of height x width x 3 arrays
images <- img_name %>%
  map(readJPEG) %>%
  set_names(janitor::clean_names(img_name))

Checking images dimension

map(images,dim)
## $purple_wave.jpg
## [1] 210 336   3
## 
## $xadrez_colorido.jpg
## [1] 161 180   3

Converting the jpeg to dataframes

# Convert a JPEG array (height x width x 3) into a tidy data frame with one
# row per pixel: x/y plotting coordinates, the r/g/b channel values, the hex
# colour string, and a unique pixel id (used later for the train/test split).
generate_df_from_jpeg <- function(img){
  h <- dim(img)[1]  # image height (number of pixel rows)
  w <- dim(img)[2]  # image width (number of pixel columns)
  data.frame(
    x = rep(seq_len(w), each = h),  # column index, repeated down each column
    y = rep(h:1, w),                # reversed so row 1 of the array plots at the top
    r = as.vector(img[,,1]),
    g = as.vector(img[,,2]),
    b = as.vector(img[,,3])
  ) %>%
    mutate(cor = rgb(r, g, b),      # "#RRGGBB" colour per pixel
           id = row_number())       # stable pixel identifier
}

images_df <- map(images, generate_df_from_jpeg)

Split the dataset into train and test

# Split one image data frame into a 40% test set (blue channel hidden) and
# the complementary 60% train set.
# NOTE(review): sample_frac() is not seeded, so the split differs between runs.
split_train_test <- function(img) {
  ret <- list()

  # Test set: 40% random sample of pixels; zero the blue channel but keep the
  # true values in b_backup so predictions can be evaluated later.
  ret$test <- img %>%
    sample_frac(2/5) %>%
    mutate(b_backup = b,
           b = 0,
           cor = rgb(r, g, b))

  # Train set: every pixel NOT sampled into the test set.
  # BUG FIX: the original filtered on ret[1]$id, which is NULL because ret[1]
  # is a one-element *list* (not the test data frame). `!id %in% NULL` is TRUE
  # for every row, so the "train" set silently contained all pixels,
  # including the test ones (data leakage).
  ret$train <- img %>%
    filter(!id %in% ret$test$id)

  ret
}

images_train_test_df <- map(images_df,split_train_test)

Generating a corrgram based on the JPEG images

map(images_df, corrgram::corrgram)

## $purple_wave.jpg
##              x            y           r           g           b
## x   1.00000000  0.000000000 -0.06307728 -0.05403876 -0.06139958
## y   0.00000000  1.000000000  0.78886627  0.72207882  0.77601361
## r  -0.06307728  0.788866273  1.00000000  0.97757782  0.99884057
## g  -0.05403876  0.722078823  0.97757782  1.00000000  0.98506976
## b  -0.06139958  0.776013608  0.99884057  0.98506976  1.00000000
## id  0.99999557 -0.002976157 -0.06542479 -0.05618754 -0.06370885
##              id
## x   0.999995571
## y  -0.002976157
## r  -0.065424789
## g  -0.056187540
## b  -0.063708846
## id  1.000000000
## 
## $xadrez_colorido.jpg
##             x            y           r           g            b
## x  1.00000000  0.000000000  0.07117786  0.01943599  0.070323647
## y  0.00000000  1.000000000 -0.00145933 -0.00198190 -0.003784754
## r  0.07117786 -0.001459330  1.00000000  0.47843676  0.147598374
## g  0.01943599 -0.001981900  0.47843676  1.00000000 -0.056291052
## b  0.07032365 -0.003784754  0.14759837 -0.05629105  1.000000000
## id 0.99998457 -0.005555448  0.07118487  0.01944670  0.070343588
##              id
## x   0.999984568
## y  -0.005555448
## r   0.071184871
## g   0.019446701
## b   0.070343588
## id  1.000000000

The first JPEG, purple_wave.jpg, has a very high correlation between the colours and the x–y axes, which suggests a linear relationship. In contrast, the second JPEG, xadrez_colorido.jpg, doesn’t show that relationship.

Something to worry about is multicollinearity in the linear regression, since the independent variables appear to be highly correlated.

Generating the models with the formula b ~ x + y + r + g values.

# Fit one model per image predicting the hidden blue channel from pixel
# position (x, y) and the two visible colour channels (r, g), using only the
# training pixels
images_lm <- map(images_train_test_df, ~ lm(formula = b ~ x + y + r + g, data = .$train))
images_cart <- map(images_train_test_df, ~ rpart(formula = b ~ x + y + r + g, data = .$train))

Predicting values based on the models

# Predict the hidden blue channel for the test pixels of one image.
# x: a fitted model; y: a list holding the image's $test data frame.
predict_images <- function(x, y){
  test_set <- y$test
  predict(x, test_set)
}

# Predicted blue channel for the test pixels, per image, for each model
images_lm_predictions <- map2(images_lm,images_train_test_df, predict_images)
images_cart_predictions <- map2(images_cart, images_train_test_df, predict_images)

Evaluate the linear regression

map(images_lm,summary)
## $purple_wave.jpg
## 
## Call:
## lm(formula = b ~ x + y + r + g, data = .$train)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.040405 -0.002844 -0.000702  0.003174  0.039471 
## 
## Coefficients:
##               Estimate Std. Error  t value Pr(>|t|)    
## (Intercept)  6.814e-04  7.169e-05    9.504  < 2e-16 ***
## x            1.642e-06  2.775e-07    5.919 3.26e-09 ***
## y           -3.511e-05  7.787e-07  -45.092  < 2e-16 ***
## r            8.191e-01  5.426e-04 1509.611  < 2e-16 ***
## g            3.127e-01  8.115e-04  385.289  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.007092 on 70555 degrees of freedom
## Multiple R-squared:  0.9994, Adjusted R-squared:  0.9994 
## F-statistic: 2.835e+07 on 4 and 70555 DF,  p-value: < 2.2e-16
## 
## 
## $xadrez_colorido.jpg
## 
## Call:
## lm(formula = b ~ x + y + r + g, data = .$train)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.32229 -0.15811 -0.12039 -0.02142  0.93542 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.106e-01  5.096e-03   21.70   <2e-16 ***
## x            3.520e-04  3.508e-05   10.03   <2e-16 ***
## y           -2.581e-05  3.912e-05   -0.66    0.509    
## r            1.639e-01  4.844e-03   33.84   <2e-16 ***
## g           -1.447e-01  5.784e-03  -25.02   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3095 on 28975 degrees of freedom
## Multiple R-squared:  0.046,  Adjusted R-squared:  0.04587 
## F-statistic: 349.3 on 4 and 28975 DF,  p-value: < 2.2e-16

The result of the linear regression for purple_wave.jpg seems very good. We got an Adjusted R-squared of 0.9994. On the other hand, for xadrez_colorido.jpg the result is not so good.

# Mean Absolute Error between the TRUE blue values of the test pixels and the
# model predictions.
# BUG FIX: the original used actual$test$b, but split_train_test() zeroes that
# column (the real values are preserved in b_backup), so the "MAE" was just
# mean(abs(predicted)). That is why purple_wave.jpg showed MAE 0.28 despite an
# adjusted R-squared of 0.9994.
MAE <- function(actual, predicted){
  mean(abs(actual$test$b_backup - predicted))
}

map2(images_train_test_df,images_lm_predictions,MAE)
## $purple_wave.jpg
## [1] 0.2832589
## 
## $xadrez_colorido.jpg
## [1] 0.1717388

Considering that the values range from 0 to 1, the linear regression has a mean absolute difference of 0.17 for xadrez_colorido.jpg, which is better than the result for purple_wave.jpg.

Assigning the predicted values to test dataframe.

# Attach both models' predicted blue channels to each image's test data frame
for (img in c("purple_wave.jpg", "xadrez_colorido.jpg")) {
  images_train_test_df[[img]]$test$b_lm <- images_lm_predictions[[img]]
  images_train_test_df[[img]]$test$b_cart <- images_cart_predictions[[img]]
}


# Rebuild the image from the test set, colouring each pixel with the
# predicted blue channel held in the column named by var_name (tidyeval).
generate_plot <- function(x, var_name){
  var_name <- enquo(var_name)

  # rgb() only accepts values in [0, 1]. An unbounded linear model can
  # predict outside that range in BOTH directions; the original only clamped
  # negatives, so any prediction > 1 would make rgb() error. Clamp both ends.
  df <- x$test %>%
    mutate(!!var_name := pmin(pmax(!!var_name, 0), 1),
           cor = rgb(r, g, !!var_name))

  # One point per pixel, coloured by its reconstructed hex colour
  ggplot(df) +
    geom_point(aes(x, y),
               colour = df$cor)

}

Finally, let’s visualise the recovered images.

Linear Regression

map(images_train_test_df,generate_plot,var_name = b_lm)
## $purple_wave.jpg

## 
## $xadrez_colorido.jpg

Using the values predicted by the Linear Regression, the purple wave was satisfactorily recovered. For “xadrez colorido”, however, the linear model seems to have failed to predict the colour correctly. It’s not possible to see any tone of blue in the image.

Regression Trees - CART

map(images_train_test_df,generate_plot,var_name = b_cart)
## $purple_wave.jpg

## 
## $xadrez_colorido.jpg

Regarding “purple wave”, the values predicted by the CART are also visually good, but the colours are not as smooth as the results from the Linear Regression. On the other hand, the CART model satisfactorily predicted the colour tones for “xadrez colorido”.

Wrapping up

Due to the non-linear relationship between the independent and dependent variables, the image “xadrez colorido” has a better prediction result with the regression tree model.